In [3]:
import pandas as pd
import plotly.express as px

# Read the Adult dataset from its local CSV export
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Categorical variables summarised on the radar chart
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
]

# Restrict the frame to those columns (pandas raises KeyError if any is absent)
cat_vars = df[categorical_columns]

# For every variable: the share, in percent, held by its most frequent category
cat_summary = cat_vars.apply(lambda x: 100 * x.value_counts(normalize=True).max())

# Two-column frame for Plotly: one row per variable (theta) with its percentage (r)
df_radar = pd.DataFrame({
    "r": cat_summary.values,
    "theta": cat_summary.index,
})

# Draw the closed polar line (radar) chart and render it
fig = px.line_polar(
    df_radar,
    r="r",
    theta="theta",
    line_close=True,
    title="Radar Chart of Categorical Variables in Adult Dataset",
)
fig.show()
In [4]:
#Question 6. Construct a web graph (or spider graph) of the categorical variables. Fine-tune the graph so that interesting results emerge. Discuss your findings.

import pandas as pd
import plotly.express as px

# Read the Adult dataset from its local CSV export
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Categorical variables that each get their own radar chart
categorical_columns = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
]

# One figure per variable: every level of the variable becomes a spoke
for category in categorical_columns:
    # Relative frequency of each level, expressed as a percentage
    cat_counts = 100 * df[category].value_counts(normalize=True)

    # Two-column frame for Plotly: level name (theta) and its percentage (r)
    df_radar = pd.DataFrame({
        "r": cat_counts.values,
        "theta": cat_counts.index,
    })

    # Closed polar line with point markers, titled after the variable
    fig = px.line_polar(
        df_radar,
        r="r",
        theta="theta",
        markers=True,
        line_close=True,
        title=f"Radar Chart for {category.capitalize()}",
    )
    fig.show()
In [1]:
#Question 9. Construct a histogram of each numerical variable, with an overlay of the target variable income.
#Normalize if necessary.
import pandas as pd
import plotly.express as px

# Load the dataset
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

target_variable = "income"

# Fail early with a clear message if the overlay column is absent
if target_variable not in df.columns:
    raise ValueError(f"Target variable '{target_variable}' not found in dataset!")

# Numeric columns, with the target explicitly excluded so it is never
# plotted against itself even if it happens to be numerically encoded
numerical_columns = [
    col
    for col in df.select_dtypes(include=['number']).columns
    if col != target_variable
]

# Min-max scale each numeric column into a copy; df itself is left untouched.
# NOTE(review): the histograms below are drawn from the raw df, so
# df_normalized is not consumed in this cell -- confirm whether the plots
# were meant to use it.
df_normalized = df.copy()
for col in numerical_columns:
    col_min = df[col].min()
    col_range = df[col].max() - col_min
    if col_range == 0:
        # Constant column: dividing by a zero range would turn every value
        # into NaN, so map the whole column to 0.0 instead
        df_normalized[col] = 0.0
    else:
        df_normalized[col] = (df[col] - col_min) / col_range

# One overlaid histogram per numeric variable, coloured by the target
for col in numerical_columns:
    fig = px.histogram(df, x=col, color=target_variable,
                       title=f"Histogram of {col} with Income Overlay",
                       nbins=30, barmode="overlay", opacity=0.7)
    fig.show()
In [2]:
#Question 10. For each pair of numerical variables, construct a scatter plot of the variables. Discuss your salient results.

import pandas as pd
import plotly.express as px
import itertools  # To generate variable pairs

# Read the Adult dataset from its local CSV export
df = pd.read_csv("C:/Users/Kadeem Green/Downloads/Adult.txt")

# Names of every numeric column in the frame
numerical_columns = list(df.select_dtypes(include=['number']).columns)

# One scatter plot per unordered pair of numeric variables, coloured by income
for var1, var2 in itertools.combinations(numerical_columns, 2):
    fig = px.scatter(
        df,
        x=var1,
        y=var2,
        color="income",  # income overlay for comparison
        opacity=0.7,
        labels={var1: var1, var2: var2},  # axis labels are the column names themselves
        title=f"Scatter Plot of {var1} vs {var2}",
    )
    fig.show()
In [ ]: